{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from google.cloud import storage\n",
    "\n",
    "# Respect GOOGLE_APPLICATION_CREDENTIALS when it is already exported; only fall back\n",
    "# to the original local service-account key path when the variable is unset.\n",
    "os.environ.setdefault(\n",
    "    'GOOGLE_APPLICATION_CREDENTIALS',\n",
    "    '/home/husein/t5/prepare/mesolitica-tpu.json',\n",
    ")\n",
    "client = storage.Client()\n",
    "bucket = client.bucket('mesolitica-tpu-general')\n",
    "# An empty value exposes no CUDA devices to libraries imported after this point.\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "import malaya\n",
    "\n",
    "def combine(splitted):\n",
    "    \"\"\"Return one string of numbered items built from `splitted`.\n",
    "\n",
    "    Each element is passed through malaya's `transformer_textcleaning`, prefixed\n",
    "    with its 1-based position ('1. ', '2. ', ...) and the pieces are joined with\n",
    "    single spaces.\n",
    "    \"\"\"\n",
    "    clean = malaya.text.function.transformer_textcleaning\n",
    "    numbered = []\n",
    "    for position, phrase in enumerate(splitted, start=1):\n",
    "        numbered.append(f'{position}. {clean(phrase)}')\n",
    "    return ' '.join(numbered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Read the file as UTF-8 explicitly instead of relying on the platform's locale default.\n",
    "with open('keywords-headline.json', encoding='utf-8') as fopen:\n",
    "    data = json.load(fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['ZLD Polres Sekadau',\n",
       "  'Polres Sekadau aksi',\n",
       "  'sembako warga panti',\n",
       "  'Sekadau aksi bakti',\n",
       "  'warga panti asuhan'],\n",
       " ['Ikatan alumni bintara polisi angkatan ke-27',\n",
       "  'warga lantaran sulitnya ekonomi ditengah',\n",
       "  'ide pemberian bantuan sembako muncul',\n",
       "  '25 paket sembako berisi beras',\n",
       "  'panti asuhan Harapan Bunda'],\n",
       " 'Ikatan alumni bintara polisi angkatan ke-27 tahun 2005 gelombang II (ZLD) Polres Sekadau, melakukan aksi bakti sosial dengan membagikan paket sembako kepada warga kurang mampu dan panti asuhan. Sebanyak 25 paket sembako berisi beras, gula, tepung, minyak goreng dan mie instan diserahkan langsung secara door to door kepada warga yang sebelumnya telah didata, untuk menghindari kerumunan.']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1. Ikatan alumni bintara polisi angkatan ke-27 2. warga lantaran sulitnya ekonomi ditengah 3. ide pemberian bantuan sembako muncul 4. 25 paket sembako berisi beras 5. panti asuhan Harapan Bunda'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "combine(data[0][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 156655/156655 [01:37<00:00, 1613.04it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "# Build aligned input/target lists: every row contributes two examples, one per\n",
    "# keyword list (row[0] and row[1]), both paired with the same text in row[2].\n",
    "before, after = [], []\n",
    "for row in tqdm(data):\n",
    "    # Clean the shared target once per row rather than once per example.\n",
    "    target = malaya.text.function.transformer_textcleaning(row[2])\n",
    "    for keywords in (row[0], row[1]):\n",
    "        before.append(combine(keywords))\n",
    "        after.append(target)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('1. ZLD Polres Sekadau 2. Polres Sekadau aksi 3. sembako warga panti 4. Sekadau aksi bakti 5. warga panti asuhan',\n",
       " 'Ikatan alumni bintara polisi angkatan ke-27 tahun 2005 gelombang II (ZLD) Polres Sekadau, melakukan aksi bakti sosial dengan membagikan paket sembako kepada warga kurang mampu dan panti asuhan. Sebanyak 25 paket sembako berisi beras, gula, tepung, minyak goreng dan mie instan diserahkan langsung secara door to door kepada warga yang sebelumnya telah didata, untuk menghindari kerumunan.')"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "before[0], after[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the aligned `before`/`after` lists into consecutive chunks of at most\n",
    "# `batch` rows; each entry of `batches` is an (inputs, targets) pair of slices.\n",
    "batch = 20000\n",
    "total = len(before)\n",
    "starts = range(0, total, batch)\n",
    "stops = [min(start + batch, total) for start in starts]\n",
    "batches = [(before[lo:hi], after[lo:hi]) for lo, hi in zip(starts, stops)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "!rm -rf t5-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "\n",
    "# Write each batch to a local TSV (input<TAB>target per line) and upload it to the\n",
    "# bucket under t5-headline-data/. Loop-local names are distinct from the notebook-level\n",
    "# `before`/`after` lists so re-running the batching cell still sees the full data.\n",
    "for batch_no, (batch_before, batch_after) in enumerate(batches):\n",
    "    filename = f'headline-generator-{batch_no}.tsv'\n",
    "    with tf.io.gfile.GFile(filename, 'w') as outfile:\n",
    "        for source, target in zip(batch_before, batch_after):\n",
    "            outfile.write('%s\\t%s\\n' % (source, target))\n",
    "\n",
    "    # Upload only after the `with` block has closed (and flushed) the file.\n",
    "    blob = bucket.blob(f't5-headline-data/{filename}')\n",
    "    blob.upload_from_filename(filename)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
